In [1]:
import numpy as np
import plotly.express as px
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import umap
from pyensembl import EnsemblRelease
from itertools import product
from Bio.Seq import translate
import pickle

import ibis
ibis.set_backend("duckdb")
ibis.options.interactive = True
from ibis import _
import ibis.selectors as s
import warnings
warnings.filterwarnings('ignore')
/Users/jordanramsdell/mambaforge/envs/ml_ibis/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Goal¶

  • We're going to explore a couple of ways to extract disease data from Open Targets and transform it into embeddings

Prelude (loading functions for later)¶

In [2]:
def construct_databases(base_loc):
    """Register every dataset found under ``base_loc`` as an ibis table.

    Parameters
    ----------
    base_loc : str
        Directory whose entries are each handed to ``ibis.read_parquet``.
        Entries whose name starts with "." are skipped.

    Returns
    -------
    dict
        Maps ``"t_" + entry name`` to whatever ``ibis.read_parquet`` returned
        for that entry.
    """
    mappings = {}
    for entry in os.listdir(base_loc):
        if entry.startswith("."):
            continue
        # os.path.join copes with a trailing separator on base_loc, so the
        # resulting path never contains a doubled "//" as plain concatenation did.
        mappings["t_" + entry] = ibis.read_parquet(os.path.join(base_loc, entry))
    return mappings

# Load parquet databases into local variables.
# construct_databases returns a dict keyed "t_<directory name>"; updating the
# namespace with it creates one variable per entry (e.g. the `t_diseases` used below).
# NOTE(review): this relies on locals() being the module-level namespace, which holds
# for top-level notebook code but not inside a function -- confirm before moving it.
locals().update(construct_databases("../../../data/open_targets/"))


def vectorize_and_embed(docs, n_components=3, use_densmap=False, 
                        metric='euclidean', n_neighbors=15, vectorizer_fun=lambda: CountVectorizer(stop_words='english')):
    """Vectorize ``docs`` and fit a UMAP model on the resulting feature matrix.

    Parameters
    ----------
    docs : iterable
        Passed straight to the vectorizer's ``fit_transform``.
    n_components : int
        Number of embedding dimensions requested from UMAP.
    use_densmap : bool
        Forwarded to UMAP as ``densmap``.
    metric : str
        Forwarded to UMAP as ``metric``.
    n_neighbors : int
        Forwarded to UMAP as ``n_neighbors``. This argument used to be accepted
        but never reached UMAP, so any value passed by the caller was ignored.
    vectorizer_fun : callable
        Zero-argument factory returning an object with ``fit_transform``.

    Returns
    -------
    The fitted ``umap.UMAP`` object (the result of ``.fit``).
    """
    features = vectorizer_fun().fit_transform(docs)
    reducer = umap.UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        densmap=use_densmap,
        metric=metric,
        random_state=42,  # fixed seed so repeated runs give the same embedding
    )
    return reducer.fit(features)
    
def construct_scatterplot(df, mapper, hover_name, color=None, hover_data=None):
    """Draw a 3-D scatter plot of a fitted embedding.

    The transposed ``mapper.embedding_`` is unpacked into exactly three
    coordinate arrays, which are written onto ``df`` as the columns "x", "y"
    and "z" (``df`` is modified in place). ``hover_name``, ``color`` and
    ``hover_data`` are forwarded to ``px.scatter_3d``.

    Returns whatever ``fig.show()`` returns.
    """
    x_coords, y_coords, z_coords = mapper.embedding_.T
    df["x"] = x_coords
    df["y"] = y_coords
    df["z"] = z_coords

    figure = px.scatter_3d(
        df,
        x="x",
        y="y",
        z="z",
        color=color,
        hover_name=hover_name,
        hover_data=hover_data,
    )
    # No margins and small markers, so the point cloud fills the output area
    figure.update_layout(margin={"l": 0, "r": 0, "t": 0, "b": 0})
    figure.update_traces(marker={"size": 2})
    return figure.show()

Data Prep¶

Example: diseases table¶

In [3]:
# Here's what the table looks like.
# `t_diseases` is one of the tables injected into the namespace by the
# locals().update(...) call above; ending the cell with the bare name displays a preview.
t_diseases
Out[3]:
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ id          ┃ code                                     ┃ dbXRefs                                  ┃ description                                                                      ┃ name                                                 ┃ directLocationIds ┃ obsoleteTerms ┃ parents                          ┃ synonyms                                                                         ┃ ancestors                                  ┃ descendants                             ┃ children                               ┃ therapeuticAreas                          ┃ indirectLocationIds ┃ ontology                                                                         ┃
┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ string      │ string                                   │ array<string>                            │ string                                                                           │ string                                               │ array<string>     │ array<string> │ array<string>                    │ struct<hasBroadSynonym: array<string>, hasExactSynonym: array<string>, hasNarro… │ array<string>                              │ array<string>                           │ array<string>                          │ array<string>                             │ array<string>       │ struct<isTherapeuticArea: boolean, leaf: boolean, sources: struct<url: string, … │
├─────────────┼──────────────────────────────────────────┼──────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────┼───────────────────┼───────────────┼──────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼────────────────────────────────────────────┼─────────────────────────────────────────┼────────────────────────────────────────┼───────────────────────────────────────────┼─────────────────────┼──────────────────────────────────────────────────────────────────────────────────┤
│ DOID_7551   │ http://purl.obolibrary.org/obo/DOID_7551 │ ['ICD9:098.89', 'MeSH:D006069', ... +17] │ A primary bacterial infectious disease that is a sexually transmitted infection… │ gonorrhea                                            │ NULL              │ NULL          │ ['EFO_0003955', 'MONDO_0000314'] │ {'hasBroadSynonym': None, 'hasExactSynonym': [...], ... +2}                      │ ['EFO_0000512', 'MONDO_0100336', ... +8]   │ []                                      │ []                                     │ ['MONDO_0100336', 'OTAR_0000017', ... +1] │ NULL                │ {'isTherapeuticArea': False, 'leaf': True, ... +1}                               │
│ EFO_0004254 │ http://www.ebi.ac.uk/efo/EFO_0004254     │ ['NCIt:C34645', 'NCIT:C34645', ... +9]   │ A slowly progressive inflammation of the glomeruli characterized by immune comp… │ membranous glomerulonephritis                        │ NULL              │ NULL          │ ['MONDO_0002462']                │ {'hasBroadSynonym': None, 'hasExactSynonym': [...], ... +2}                      │ ['EFO_1002050', 'EFO_0009690', ... +3]     │ ['MONDO_0013860']                       │ ['MONDO_0013860']                      │ ['EFO_0009690']                           │ NULL                │ {'isTherapeuticArea': False, 'leaf': False, ... +1}                              │
│ EFO_0005189 │ http://www.ebi.ac.uk/efo/EFO_0005189     │ ['SNOMEDCT:74427007']                    │ The respiratory quotient (or RQ or respiratory coefficient), is a dimensionless… │ respiratory quotient                                 │ NULL              │ NULL          │ ['EFO_0005115']                  │ NULL                                                                             │ ['EFO_0001444', 'EFO_0005115']             │ []                                      │ []                                     │ ['EFO_0001444']                           │ NULL                │ {'isTherapeuticArea': False, 'leaf': True, ... +1}                               │
│ EFO_0005853 │ http://www.ebi.ac.uk/efo/EFO_0005853     │ []                                       │ short or long term physiological response of an organism, eg in terms of deposi… │ response to silica exposure                          │ NULL              │ NULL          │ ['GO_0050896']                   │ NULL                                                                             │ ['GO_0050896', 'GO_0008150']               │ []                                      │ []                                     │ ['GO_0008150']                            │ NULL                │ {'isTherapeuticArea': False, 'leaf': True, ... +1}                               │
│ EFO_0006317 │ http://www.ebi.ac.uk/efo/EFO_0006317     │ []                                       │ Any process that results in a change in state or activity of a cell or an organ… │ response to thiopurine                               │ NULL              │ NULL          │ ['GO_0042493']                   │ NULL                                                                             │ ['GO_0050896', 'GO_0008150', ... +1]       │ ['EFO_0007853']                         │ ['EFO_0007853']                        │ ['GO_0008150']                            │ NULL                │ {'isTherapeuticArea': False, 'leaf': False, ... +1}                              │
│ EFO_0007229 │ http://www.ebi.ac.uk/efo/EFO_0007229     │ ['DOID:12053', 'NCIT:C2967', ... +11]    │ An opportunistic mycosis that results_in fungal infection and has_material_basi… │ cryptococcosis                                       │ NULL              │ NULL          │ ['EFO_0001067', 'MONDO_0002312'] │ {'hasBroadSynonym': None, 'hasExactSynonym': [...], ... +2}                      │ ['MONDO_0100336', 'MONDO_0002312', ... +3] │ ['EFO_0007228']                         │ ['EFO_0007228']                        │ ['MONDO_0100336', 'EFO_0005741']          │ NULL                │ {'isTherapeuticArea': False, 'leaf': False, ... +1}                              │
│ EFO_0007391 │ http://www.ebi.ac.uk/efo/EFO_0007391     │ ['DOID:3106', 'MESH:D009349', ... +5]    │ Infections caused by nematode larvae which never develop into the adult stage a… │ Nematoda infectious disease                          │ NULL              │ NULL          │ ['EFO_1001342']                  │ {'hasBroadSynonym': None, 'hasExactSynonym': [...], ... +2}                      │ ['MONDO_0100336', 'EFO_0005741', ... +2]   │ ['EFO_0007154', 'EFO_0007253', ... +23] │ ['EFO_0007253', 'EFO_0007468', ... +1] │ ['MONDO_0100336', 'EFO_0005741']          │ NULL                │ {'isTherapeuticArea': False, 'leaf': False, ... +1}                              │
│ EFO_0008080 │ http://www.ebi.ac.uk/efo/EFO_0008080     │ []                                       │ quantification of the volume of cerebrospinal fluid in the brain, usually throu… │ cerebrospinal fluid volume measurement               │ NULL              │ NULL          │ ['EFO_0006930']                  │ NULL                                                                             │ ['EFO_0005052', 'EFO_0001444', ... +2]     │ ['EFO_0008367']                         │ ['EFO_0008367']                        │ ['EFO_0001444']                           │ NULL                │ {'isTherapeuticArea': False, 'leaf': False, ... +1}                              │
│ EFO_0008167 │ http://www.ebi.ac.uk/efo/EFO_0008167     │ []                                       │ quantification of the amount of interleukin 1 Receptor accessory protein in a s… │ interleukin 1 Receptor accessory protein measurement │ NULL              │ NULL          │ ['EFO_0007937']                  │ NULL                                                                             │ ['EFO_0004747', 'EFO_0001444', ... +1]     │ []                                      │ []                                     │ ['EFO_0001444']                           │ NULL                │ {'isTherapeuticArea': False, 'leaf': True, ... +1}                               │
│ EFO_0008181 │ http://www.ebi.ac.uk/efo/EFO_0008181     │ []                                       │ quantification of the amount of interleukin 23 receptor in a sample              │ interleukin 23 receptor measurement                  │ NULL              │ NULL          │ ['EFO_0007937']                  │ NULL                                                                             │ ['EFO_0004747', 'EFO_0001444', ... +1]     │ []                                      │ []                                     │ ['EFO_0001444']                           │ NULL                │ {'isTherapeuticArea': False, 'leaf': True, ... +1}                               │
│ …           │ …                                        │ …                                        │ …                                                                                │ …                                                    │ …                 │ …             │ …                                │ …                                                                                │ …                                          │ …                                       │ …                                      │ …                                         │ …                   │ …                                                                                │
└─────────────┴──────────────────────────────────────────┴──────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────┴───────────────────┴───────────────┴──────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴────────────────────────────────────────────┴─────────────────────────────────────────┴────────────────────────────────────────┴───────────────────────────────────────────┴─────────────────────┴──────────────────────────────────────────────────────────────────────────────────┘

Example: baseline gene expression¶

  • We look at these tables because each disease can have one or more targets associated with it
  • The protein-coding targets have expression values in the "baselineExpression" table
  • We can use this to represent a disease as a gene expression vector by aggregating over the associated targets
In [4]:
# For this demo, only consider protein-coding targets that have a non-null `go` value.
# The two conditions are combined with `&`: Python's `and` cannot build a combined
# ibis predicate -- it tests the truthiness of the left operand and yields just one
# of the two operands, so both conditions never reached filter() together.
# The null check uses .notnull() because a `!= None` comparison is not a null test.
protein_coding_targets = t_targets.filter(
    (_.biotype == 'protein_coding') & _.go.notnull()
)
In [5]:
# Here is what the baseline expression table normally looks like.
# Note how deeply nested it is: `tissues` holds an array of structs for each `id`.
t_baselineExpression
Out[5]:
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ id              ┃ tissues                                                                          ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ string          │ array<struct<efo_code: string, label: string, organs: array<string>, anatomical… │
├─────────────────┼──────────────────────────────────────────────────────────────────────────────────┤
│ ENSG00000020219 │ [{...}, {...}, ... +108]                                                         │
│ ENSG00000059588 │ [{...}, {...}, ... +108]                                                         │
│ ENSG00000070182 │ [{...}, {...}, ... +118]                                                         │
│ ENSG00000070366 │ [{...}, {...}, ... +118]                                                         │
│ ENSG00000072071 │ [{...}, {...}, ... +117]                                                         │
│ ENSG00000073536 │ [{...}, {...}, ... +118]                                                         │
│ ENSG00000075290 │ [{...}, {...}, ... +108]                                                         │
│ ENSG00000083454 │ [{...}, {...}, ... +118]                                                         │
│ ENSG00000083782 │ [{...}, {...}, ... +117]                                                         │
│ ENSG00000086200 │ [{...}, {...}, ... +118]                                                         │
│ …               │ …                                                                                │
└─────────────────┴──────────────────────────────────────────────────────────────────────────────────┘
In [6]:
# Here it is unpacked:
#   1. unnest() gives each element of the `tissues` array its own row
#   2. unpack() expands the fields of that struct into separate columns
(t_baselineExpression
 .select("id", _.tissues.unnest())
 .unpack("tissues")
)
Out[6]:
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ id              ┃ efo_code       ┃ label                                        ┃ organs                                           ┃ anatomical_systems                        ┃ rna                                                               ┃ protein                                                                          ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ string          │ string         │ string                                       │ array<string>                                    │ array<string>                             │ struct<value: float64, zscore: int32, level: int32, unit: string> │ struct<reliability: boolean, level: int32, cell_type: array<struct<name: string… │
├─────────────────┼────────────────┼──────────────────────────────────────────────┼──────────────────────────────────────────────────┼───────────────────────────────────────────┼───────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┤
│ ENSG00000020219 │ UBERON_0012249 │ ectocervix                                   │ ['reproductive organ', 'reproductive structure'] │ ['reproductive system']                   │ {'value': 2.0, 'zscore': -1, ... +2}                              │ {'reliability': False, 'level': -1, ... +1}                                      │
│ ENSG00000020219 │ CL_0000235     │ macrophage                                   │ ['immune organ', 'blood']                        │ ['immune system', 'hematopoietic system'] │ {'value': 0.0, 'zscore': -1, ... +2}                              │ {'reliability': False, 'level': -1, ... +1}                                      │
│ ENSG00000020219 │ CL_0000787     │ memory B cell                                │ ['immune organ', 'blood']                        │ ['immune system', 'hematopoietic system'] │ {'value': 0.0, 'zscore': -1, ... +2}                              │ {'reliability': False, 'level': -1, ... +1}                                      │
│ ENSG00000020219 │ CL_0000815     │ regulatory T cell                            │ ['immune organ', 'blood']                        │ ['immune system', 'hematopoietic system'] │ {'value': 0.0, 'zscore': -1, ... +2}                              │ {'reliability': False, 'level': -1, ... +1}                                      │
│ ENSG00000020219 │ UBERON_0000948 │ heart                                        │ ['heart']                                        │ ['circulatory system']                    │ {'value': 0.0, 'zscore': -1, ... +2}                              │ {'reliability': False, 'level': -1, ... +1}                                      │
│ ENSG00000020219 │ UBERON_0001154 │ vermiform appendix                           │ ['intestine', 'colon']                           │ ['digestive system']                      │ {'value': 0.0, 'zscore': -1, ... +2}                              │ {'reliability': False, 'level': -1, ... +1}                                      │
│ ENSG00000020219 │ UBERON_0001876 │ amygdala                                     │ ['brain']                                        │ ['nervous system']                        │ {'value': 0.0, 'zscore': -1, ... +2}                              │ {'reliability': False, 'level': -1, ... +1}                                      │
│ ENSG00000020219 │ UBERON_0002190 │ subcutaneous adipose tissue                  │ ['connective tissue']                            │ ['integumental system']                   │ {'value': 0.0, 'zscore': -1, ... +2}                              │ {'reliability': False, 'level': -1, ... +1}                                      │
│ ENSG00000020219 │ CL_0002618     │ endothelial cell of umbilical vein (resting) │ ['blood']                                        │ ['circulatory system']                    │ {'value': 0.0, 'zscore': -1, ... +2}                              │ {'reliability': False, 'level': -1, ... +1}                                      │
│ ENSG00000020219 │ UBERON_0001873 │ caudate nucleus                              │ ['brain']                                        │ ['nervous system']                        │ {'value': 1.0, 'zscore': -1, ... +2}                              │ {'reliability': False, 'level': -1, ... +1}                                      │
│ …               │ …              │ …                                            │ …                                                │ …                                         │ …                                                                 │ …                                                                                │
└─────────────────┴────────────────┴──────────────────────────────────────────────┴──────────────────────────────────────────────────┴───────────────────────────────────────────┴───────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┘

Gene Expression to Vector of Anatomical System Expressions¶

  • Using zscore values here, but they are also available as TPMs
In [7]:
# Here's an example where we map targets onto expression vectors:
# one row per target id, one column per anatomical system, values = mean RNA z-score.
query_baseline = (t_baselineExpression
 .select("id", _.tissues.unnest())
 .unpack("tissues")
 # Only the first entry of `anatomical_systems` is used as the tissue's system
 .select(id=_.id, system=_.anatomical_systems[0], zscore=_.rna.zscore)
 .filter(~_.system.isnull()) # Ignore expression values where system is null
 .group_by(("id", "system"))
 .agg(mean_zscore = _.zscore.mean())
 
 # Pivot to wide format: one row per id, one column per system.
 # Spaces in system names are replaced first so the column names use underscores.
 .mutate(system = _.system.replace(" ", "_"))
 .pivot_wider(id_cols="id", names_from="system", values_from="mean_zscore")
 .drop("sensory_system") # most are null anyway           
)
query_baseline
Out[7]:
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃ id              ┃ musculoskeletal_system ┃ integumental_system ┃ circulatory_system ┃ renal_system ┃ connective_tissue ┃ hematopoietic_system ┃ hemolymphoid_system ┃ digestive_system ┃ respiratory_system ┃ external_soft_tissue_zone ┃ nervous_system ┃ immune_system ┃ anatomical_junction ┃ endocrine_system ┃ anatomical_wall ┃ reproductive_system ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ string          │ float64                │ float64             │ float64            │ float64      │ float64           │ float64              │ float64             │ float64          │ float64            │ float64                   │ float64        │ float64       │ float64             │ float64          │ float64         │ float64             │
├─────────────────┼────────────────────────┼─────────────────────┼────────────────────┼──────────────┼───────────────────┼──────────────────────┼─────────────────────┼──────────────────┼────────────────────┼───────────────────────────┼────────────────┼───────────────┼─────────────────────┼──────────────────┼─────────────────┼─────────────────────┤
│ ENSG00000271127 │                   -1.0 │                -1.0 │          -1.000000 │    -1.000000 │              -1.0 │                 -1.0 │           -1.000000 │        -1.000000 │              -1.00 │                      -1.0 │      -1.000000 │       -1.0000 │                -1.0 │        -1.000000 │            -1.0 │           -1.000000 │
│ ENSG00000229442 │                   -1.0 │                -0.6 │          -1.000000 │    -1.000000 │              -1.0 │                 -1.0 │           -1.000000 │        -1.000000 │              -1.00 │                      -1.0 │      -1.000000 │       -1.0000 │                -1.0 │        -1.000000 │            -1.0 │           -1.000000 │
│ ENSG00000095587 │                    0.5 │                -1.0 │           0.666667 │    -1.000000 │              -1.0 │                 -0.3 │           -0.142857 │        -0.437500 │              -1.00 │                      -1.0 │       2.055556 │       -0.8125 │                 0.0 │        -0.625000 │            -1.0 │           -0.900000 │
│ ENSG00000219257 │                   -1.0 │                -1.0 │          -1.000000 │    -1.000000 │              -1.0 │                 -1.0 │           -1.000000 │        -1.000000 │              -1.00 │                      -1.0 │      -1.000000 │       -1.0000 │                -1.0 │        -1.000000 │            -1.0 │           -1.000000 │
│ ENSG00000158806 │                   -1.0 │                -1.0 │          -0.900000 │    -0.333333 │              -1.0 │                 -0.4 │            0.000000 │        -0.882353 │              -1.00 │                      -1.0 │       1.800000 │       -0.8750 │                -1.0 │         0.222222 │            -1.0 │           -0.750000 │
│ ENSG00000179941 │                    0.5 │                 0.2 │          -0.100000 │     0.000000 │               2.0 │                  0.1 │           -0.571429 │         0.058824 │              -0.25 │                       0.0 │       0.000000 │       -0.9375 │                 0.0 │        -0.375000 │             0.0 │            0.000000 │
│ ENSG00000204685 │                    1.5 │                 0.6 │          -0.222222 │     0.666667 │               0.0 │                  0.2 │           -0.428571 │         0.312500 │               0.00 │                       0.0 │      -0.388889 │       -1.0000 │                 0.0 │         0.000000 │             0.0 │            0.500000 │
│ ENSG00000244165 │                   -0.5 │                 0.2 │          -0.800000 │    -0.333333 │               0.0 │                 -0.7 │            0.285714 │        -0.235294 │              -0.50 │                       0.0 │       0.750000 │       -0.5625 │                 0.0 │        -0.250000 │             0.0 │           -0.083333 │
│ ENSG00000269316 │                   -1.0 │                -1.0 │          -1.000000 │    -1.000000 │              -1.0 │                 -1.0 │           -1.000000 │        -1.000000 │              -1.00 │                      -1.0 │      -1.000000 │       -1.0000 │                -1.0 │        -1.000000 │            -1.0 │           -1.000000 │
│ ENSG00000235049 │                   -1.0 │                -1.0 │          -0.888889 │    -1.000000 │              -1.0 │                 -1.0 │           -1.000000 │        -0.875000 │              -1.00 │                      -1.0 │       0.055556 │       -1.0000 │                -1.0 │        -0.250000 │            -1.0 │           -0.900000 │
│ …               │                      … │                   … │                  … │            … │                 … │                    … │                   … │                … │                  … │                         … │              … │             … │                   … │                … │               … │                   … │
└─────────────────┴────────────────────────┴─────────────────────┴────────────────────┴──────────────┴───────────────────┴──────────────────────┴─────────────────────┴──────────────────┴────────────────────┴───────────────────────────┴────────────────┴───────────────┴─────────────────────┴──────────────────┴─────────────────┴─────────────────────┘

Disease to Vector of Anatomical System Expressions¶

In [8]:
# Finally, we average over the associated target vectors for each disease
disease_expression_query = (t_associationByOverallDirect
 .select("diseaseId", "targetId")
 
 # Join gene expression values to diseases (diseases have one or more associated genes).
 # Left join: association rows are kept even when the target has no row in query_baseline.
 .left_join(query_baseline, _.targetId == query_baseline.id)
 # Drop both join keys so only diseaseId and the per-system columns remain
 .drop("targetId", "id")
 
 # Average every numeric column over the rows belonging to each disease
 .group_by("diseaseId")
 .agg(s.across(s.numeric(), _.mean())))
disease_expression_query
Out[8]:
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃ diseaseId   ┃ musculoskeletal_system ┃ integumental_system ┃ circulatory_system ┃ renal_system ┃ connective_tissue ┃ hematopoietic_system ┃ hemolymphoid_system ┃ digestive_system ┃ respiratory_system ┃ external_soft_tissue_zone ┃ nervous_system ┃ immune_system ┃ anatomical_junction ┃ endocrine_system ┃ anatomical_wall ┃ reproductive_system ┃
┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ string      │ float64                │ float64             │ float64            │ float64      │ float64           │ float64              │ float64             │ float64          │ float64            │ float64                   │ float64        │ float64       │ float64             │ float64          │ float64         │ float64             │
├─────────────┼────────────────────────┼─────────────────────┼────────────────────┼──────────────┼───────────────────┼──────────────────────┼─────────────────────┼──────────────────┼────────────────────┼───────────────────────────┼────────────────┼───────────────┼─────────────────────┼──────────────────┼─────────────────┼─────────────────────┤
│ EFO_0000588 │              -0.084173 │           -0.093347 │          -0.245621 │    -0.161626 │          0.181818 │            -0.250808 │           -0.134073 │        -0.094947 │          -0.323757 │                 -0.201613 │      -0.406610 │     -0.407513 │           -0.289899 │        -0.429463 │        0.020202 │           -0.299737 │
│ EFO_0004253 │              -0.216625 │           -0.217128 │          -0.331598 │     0.269521 │         -0.261965 │            -0.412594 │           -0.378194 │        -0.148235 │          -0.499790 │                 -0.307305 │      -0.409484 │     -0.565019 │           -0.367758 │        -0.381647 │       -0.176322 │           -0.406633 │
│ EFO_0004254 │              -0.143302 │           -0.205607 │          -0.226618 │    -0.051921 │         -0.084112 │            -0.368224 │           -0.210058 │        -0.121793 │          -0.356957 │                 -0.255452 │      -0.433729 │     -0.454245 │           -0.352025 │        -0.410696 │        0.012461 │           -0.388474 │
│ EFO_0005239 │              -0.250000 │           -0.137143 │          -0.261429 │    -0.204762 │          0.128571 │            -0.447143 │           -0.285714 │        -0.179149 │          -0.327381 │                 -0.028571 │      -0.641100 │     -0.443750 │           -0.457143 │        -0.391468 │        0.257143 │           -0.367619 │
│ EFO_0005243 │               0.045455 │           -0.119192 │          -0.088552 │    -0.333333 │         -0.030303 │            -0.051515 │           -0.112554 │        -0.140931 │          -0.239899 │                 -0.424242 │      -0.301515 │     -0.469697 │           -0.242424 │        -0.442340 │       -0.212121 │           -0.337879 │
│ EFO_0005252 │              -0.165789 │           -0.248421 │          -0.296725 │    -0.163158 │         -0.331579 │            -0.232632 │           -0.245865 │        -0.297252 │          -0.498684 │                 -0.357895 │      -0.106053 │     -0.526316 │           -0.305263 │        -0.419371 │       -0.247368 │           -0.432719 │
│ EFO_0005272 │              -0.187500 │           -0.325000 │          -0.600000 │     0.083333 │         -0.750000 │            -0.800000 │           -0.750000 │        -0.465074 │          -0.875000 │                 -0.375000 │      -0.597222 │     -0.859375 │           -0.625000 │        -0.517361 │       -0.250000 │           -0.568750 │
│ EFO_0005407 │              -0.159231 │           -0.316872 │          -0.360769 │    -0.311282 │         -0.211094 │            -0.412327 │           -0.332308 │        -0.307460 │          -0.512821 │                 -0.410769 │       0.010661 │     -0.555566 │           -0.359014 │        -0.449744 │       -0.235747 │           -0.454124 │
│ EFO_0007885 │              -0.062500 │           -0.250000 │          -0.297222 │    -0.041667 │          1.125000 │            -0.012500 │            0.089286 │        -0.147059 │          -0.468750 │                 -0.625000 │      -0.525000 │      0.148438 │           -0.500000 │        -0.194444 │       -0.375000 │           -0.575000 │
│ EFO_0007893 │               0.200000 │           -0.320000 │          -0.224444 │     0.066667 │          0.200000 │            -0.440000 │           -0.400000 │         0.163971 │           0.350000 │                  0.400000 │       0.107778 │     -0.512500 │           -0.400000 │        -0.619444 │       -0.200000 │           -0.343333 │
│ …           │                      … │                   … │                  … │            … │                 … │                    … │                   … │                … │                  … │                         … │              … │             … │                   … │                … │               … │                   … │
└─────────────┴────────────────────────┴─────────────────────┴────────────────────┴──────────────┴───────────────────┴──────────────────────┴─────────────────────┴──────────────────┴────────────────────┴───────────────────────────┴────────────────┴───────────────┴─────────────────────┴──────────────────┴─────────────────┴─────────────────────┘

Creating system labels¶

  • We're going to use these labels to color our graphs.
  • They represent the anatomical system in which the genes associated with each disease are most highly expressed
    • Not perfectly representative, but helps give us some intuition in terms of how they cluster.
In [9]:
# Reshape the wide per-system expression table into long form:
# one row per (diseaseId, system) pair with its expression value.
expression_by_system = disease_expression_query.pivot_longer(
    ~s.c("diseaseId"),
    names_to="system",
    values_to="expression",
)

# For every disease, keep the name of the system whose expression value is largest.
disease_label_by_highest_expressed_system = (
    expression_by_system
    .group_by("diseaseId")
    .agg(most_expressed_in_system=_.system.argmax(_.expression))
)
In [10]:
# Display the label table; ibis interactive mode is switched on in the import
# cell, so a bare table expression is evaluated and rendered as a preview.
disease_label_by_highest_expressed_system
Out[10]:
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ diseaseId   ┃ most_expressed_in_system  ┃
┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ string      │ string                    │
├─────────────┼───────────────────────────┤
│ EFO_0000673 │ connective_tissue         │
│ EFO_0004346 │ connective_tissue         │
│ EFO_0007859 │ integumental_system       │
│ EFO_1000049 │ anatomical_wall           │
│ EFO_1000054 │ hemolymphoid_system       │
│ EFO_1000058 │ hemolymphoid_system       │
│ EFO_1000066 │ anatomical_wall           │
│ EFO_1001478 │ anatomical_wall           │
│ EFO_1001491 │ connective_tissue         │
│ EFO_1001498 │ external_soft_tissue_zone │
│ …           │ …                         │
└─────────────┴───────────────────────────┘

Creating final query¶

In [11]:
# Diseases that have a description, with `id` renamed to match the join key
# used by the label and expression tables.
described_diseases = (
    t_diseases
    .dropna("description")  # don't bother if it doesn't have a description
    .select("name", diseaseId="id", desc="description")
)

# Attach the system label and the per-system expression columns to each disease.
query_final = (
    described_diseases
    .inner_join(disease_label_by_highest_expressed_system, "diseaseId")
    .inner_join(disease_expression_query, "diseaseId")
)
query_final
Out[11]:
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃ name                                                 ┃ diseaseId   ┃ desc                                                                             ┃ most_expressed_in_system ┃ musculoskeletal_system ┃ integumental_system ┃ circulatory_system ┃ renal_system ┃ connective_tissue ┃ hematopoietic_system ┃ hemolymphoid_system ┃ digestive_system ┃ respiratory_system ┃ external_soft_tissue_zone ┃ nervous_system ┃ immune_system ┃ anatomical_junction ┃ endocrine_system ┃ anatomical_wall ┃ reproductive_system ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ string                                               │ string      │ string                                                                           │ string                   │ float64                │ float64             │ float64            │ float64      │ float64           │ float64              │ float64             │ float64          │ float64            │ float64                   │ float64        │ float64       │ float64             │ float64          │ float64         │ float64             │
├──────────────────────────────────────────────────────┼─────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────┼────────────────────────┼─────────────────────┼────────────────────┼──────────────┼───────────────────┼──────────────────────┼─────────────────────┼──────────────────┼────────────────────┼───────────────────────────┼────────────────┼───────────────┼─────────────────────┼──────────────────┼─────────────────┼─────────────────────┤
│ gonorrhea                                            │ DOID_7551   │ A primary bacterial infectious disease that is a sexually transmitted infection… │ connective_tissue        │              -0.096844 │           -0.124075 │          -0.269214 │    -0.183644 │          0.082471 │            -0.241006 │           -0.171879 │        -0.138369 │          -0.388618 │                 -0.267145 │      -0.324140 │     -0.419989 │           -0.315517 │        -0.397776 │       -0.165230 │           -0.310319 │
│ respiratory quotient                                 │ EFO_0005189 │ The respiratory quotient (or RQ or respiratory coefficient), is a dimensionless… │ musculoskeletal_system   │               2.000000 │           -0.400000 │          -0.400000 │    -0.666667 │          0.000000 │             0.300000 │            0.000000 │        -0.352941 │          -0.500000 │                  0.000000 │      -0.800000 │      0.437500 │           -1.000000 │        -0.333333 │        0.000000 │            0.083333 │
│ response to silica exposure                          │ EFO_0005853 │ short or long term physiological response of an organism, eg in terms of deposi… │ connective_tissue        │              -0.500000 │            0.266667 │          -0.733333 │    -0.222222 │          0.666667 │             0.266667 │            0.571429 │         0.058824 │          -0.138889 │                 -0.333333 │      -0.750000 │      0.479167 │           -0.666667 │        -0.185185 │        0.333333 │           -0.416667 │
│ response to thiopurine                               │ EFO_0006317 │ Any process that results in a change in state or activity of a cell or an organ… │ hemolymphoid_system      │               0.250000 │            0.250000 │          -0.426389 │     0.083333 │         -0.125000 │            -0.112500 │            0.482143 │         0.209099 │           0.125000 │                 -0.375000 │      -0.402778 │      0.078125 │           -0.875000 │        -0.588542 │       -0.375000 │           -0.741667 │
│ cryptococcosis                                       │ EFO_0007229 │ An opportunistic mycosis that results_in fungal infection and has_material_basi… │ hemolymphoid_system      │              -0.274286 │           -0.309714 │          -0.405206 │    -0.312381 │         -0.468571 │            -0.230857 │           -0.031837 │        -0.155399 │          -0.220000 │                 -0.451429 │      -0.514404 │     -0.271786 │           -0.537143 │        -0.509762 │       -0.091429 │           -0.504450 │
│ Nematoda infectious disease                          │ EFO_0007391 │ Infections caused by nematode larvae which never develop into the adult stage a… │ anatomical_wall          │              -0.345930 │           -0.247674 │          -0.374354 │    -0.184109 │         -0.255814 │            -0.322674 │           -0.178571 │        -0.090544 │          -0.357558 │                 -0.325581 │      -0.578775 │     -0.430596 │           -0.430233 │        -0.452116 │        0.005814 │           -0.487350 │
│ interleukin 1 Receptor accessory protein measurement │ EFO_0008167 │ quantification of the amount of interleukin 1 Receptor accessory protein in a s… │ hematopoietic_system     │              -0.750000 │           -0.500000 │          -0.800000 │    -0.500000 │         -0.500000 │             0.300000 │           -0.357143 │        -0.676471 │          -0.750000 │                 -1.000000 │      -0.925000 │     -0.593750 │           -1.000000 │        -0.555556 │       -1.000000 │           -0.666667 │
│ interleukin 23 receptor measurement                  │ EFO_0008181 │ quantification of the amount of interleukin 23 receptor in a sample              │ hemolymphoid_system      │              -0.500000 │           -0.650000 │          -0.944444 │    -0.666667 │         -0.500000 │            -0.325000 │           -0.035714 │        -0.133272 │          -0.562500 │                 -1.000000 │      -0.805556 │     -0.312500 │           -1.000000 │        -0.315972 │       -0.750000 │           -0.354167 │
│ atypical femoral fracture                            │ EFO_0009960 │ Stress or insufficency fractures occurring in the femoral shaft, typically in r… │ hemolymphoid_system      │              -0.333333 │           -0.133333 │          -0.633333 │    -0.444444 │         -0.666667 │             0.133333 │            0.238095 │        -0.607843 │          -0.750000 │                 -0.333333 │      -0.816667 │      0.125000 │           -0.666667 │        -0.666667 │        0.000000 │           -0.750000 │
│ CD40 measurement                                     │ EFO_0010586 │ quantification of the amount of CD40 in a sample                                 │ digestive_system         │              -0.750000 │            0.500000 │          -0.400000 │     0.333333 │         -0.500000 │            -0.600000 │            0.214286 │         0.705882 │           0.458333 │                  0.500000 │      -0.875000 │     -0.468750 │           -0.500000 │        -0.111111 │        0.000000 │           -0.083333 │
│ …                                                    │ …           │ …                                                                                │ …                        │                      … │                   … │                  … │            … │                 … │                    … │                   … │                … │                  … │                         … │              … │             … │                   … │                … │               … │                   … │
└──────────────────────────────────────────────────────┴─────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────┴────────────────────────┴─────────────────────┴────────────────────┴──────────────┴───────────────────┴──────────────────────┴─────────────────────┴──────────────────┴────────────────────┴───────────────────────────┴────────────────┴───────────────┴─────────────────────┴──────────────────┴─────────────────┴─────────────────────┘
In [12]:
# Materialize the joined ibis query as a pandas DataFrame.
df = query_final.to_pandas()

# Positional row number (for debugging purposes); shown in the plot hover data.
df["index"] = range(len(df))

Mapper Prep / Visualization¶

Disease Descriptions -> Embeddings¶

  • We're going to be using the "desc" field, which is the disease's description
  • The descriptions will be tokenized and represented using the bag-of-words model
  • Using tf–idf via sklearn's TfidfVectorizer
In [13]:
def _l1_tfidf_vectorizer():
    """Build the tf-idf vectorizer used for descriptions (English stop words, L1-normalized rows)."""
    return TfidfVectorizer(stop_words='english', norm='l1')


# Embed the description text; the vectorizer factory is called inside vectorize_and_embed.
disease_desc_mapper = vectorize_and_embed(
    df["desc"],
    metric='hellinger',
    vectorizer_fun=_l1_tfidf_vectorizer,
)

Disease Descriptions -> Visualization¶

  • Color-wise, there is some clustering behavior, but it's not very striking
In [14]:
# 3D scatter of the description embedding, colored by the most-expressed system.
construct_scatterplot(
    df,
    disease_desc_mapper,
    hover_name="name",
    color="most_expressed_in_system",
    hover_data=["diseaseId", "index"],
)

Disease Tissue Expression -> Embeddings¶

  • Now we will embed using our expression vectors
In [15]:
# Build the expression matrix from the already-materialized `df` rather than
# executing `query_final` a second time. The query has no ORDER BY, so a second
# execution is not guaranteed to return rows in the same order as `df`, and
# construct_scatterplot pairs embedding rows with `df` rows purely by position.
# Selecting by the query's numeric column names also keeps helper columns that
# were added to `df` afterwards (e.g. "index", "x", "y", "z") out of the matrix.
expression_columns = list(query_final.select(s.numeric()).columns)
expression_vectors = df[expression_columns].fillna(0)  # missing expression values -> 0

disease_expression_mapper = umap.UMAP(n_components=3, metric="euclidean", random_state=42).fit(expression_vectors)

Disease Tissue Expression -> Visualization¶

  • When we represent diseases using gene expression embeddings, we see more clustering behavior according to coloring
  • This behavior isn't surprising considering how we derived the embeddings (from gene expression in anatomical systems)
In [16]:
# 3D scatter of the expression embedding, colored by the most-expressed system.
construct_scatterplot(
    df,
    disease_expression_mapper,
    hover_name="name",
    color="most_expressed_in_system",
    hover_data=["diseaseId", "index"],
)

Exporting Mappers (for prediction in another notebook)¶

In [17]:
# Will probably need to look into using an lil_matrix instead...
# Create the output directory if it is missing so a fresh checkout can run this cell.
os.makedirs("models", exist_ok=True)

# Write each artifact inside a `with` block so its file handle is flushed and
# closed as soon as the object has been pickled.
for artifact_path, artifact in (
    ("models/disease_expression_mapper.sav", disease_expression_mapper),
    ("models/disease_desc_mapper.sav", disease_desc_mapper),
    ("models/disease_df.sav", df),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)
# pickle.dump(disease_names, open("models/disease_names.sav", 'wb'))
In [ ]: